> Agent Memory Systems

Budding
planted Jan 8, 2026 · tended Jan 8, 2026
#ai-agents #memory #vector-databases #context-management

Agent Memory Systems

🌿 Budding note β€” memory architectures for intelligent agents.

Why Memory Matters

Agents need memory to:

  • Maintain context across conversations
  • Learn from experience and improve over time
  • Recall past interactions for personalization
  • Avoid repeating mistakes
  • Build knowledge incrementally

Related: AI Agents Fundamentals for core concepts

Types of Memory

1. Short-Term Memory (Working Memory)

Purpose: Immediate conversation context

class ShortTermMemory:
    """Sliding window of recent messages.

    Uses collections.deque with maxlen so evicting the oldest message
    is O(1); the original list.pop(0) shifted every element (O(n)).
    """
    def __init__(self, max_messages: int = 10):
        # deque(maxlen=...) silently discards the oldest entry when a
        # new append pushes the length past max_messages.
        self.messages = deque(maxlen=max_messages)
        self.max_messages = max_messages

    def add(self, role: str, content: str) -> None:
        """Append a message; the oldest is evicted automatically."""
        self.messages.append({
            "role": role,
            "content": content,
            "timestamp": time.time()
        })

    def get_context(self) -> list[dict]:
        """Return messages in LLM chat format (role/content only)."""
        return [
            {"role": msg["role"], "content": msg["content"]}
            for msg in self.messages
        ]

    def clear(self) -> None:
        """Drop all messages from working memory."""
        self.messages.clear()

Usage:

memory = ShortTermMemory(max_messages=10)

memory.add("user", "What's the weather in Tokyo?")
memory.add("assistant", "It's 18Β°C and partly cloudy in Tokyo")
memory.add("user", "What about Paris?")

# Get context for next LLM call
messages = memory.get_context()

2. Long-Term Memory (Vector Store)

Purpose: Persistent knowledge retrieval

from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from sentence_transformers import SentenceTransformer

class LongTermMemory:
    """Vector database for semantic search over stored memories."""

    def __init__(self, collection_name: str = "memories"):
        self.client = QdrantClient(":memory:")  # Or remote URL
        self.encoder = SentenceTransformer('all-MiniLM-L6-v2')
        self.collection = collection_name

        # Create collection
        self.client.create_collection(
            collection_name=self.collection,
            vectors_config=VectorParams(
                size=384,  # all-MiniLM-L6-v2 embedding size
                distance=Distance.COSINE
            )
        )

    @staticmethod
    def _point_id(text: str) -> int:
        """Deterministic point ID for a text.

        Built-in hash() is randomized per process (PYTHONHASHSEED), so the
        original `hash(text) % 10**8` could not address the same point
        across restarts. A truncated SHA-1 digest is stable everywhere.
        """
        import hashlib
        return int(hashlib.sha1(text.encode("utf-8")).hexdigest()[:12], 16)

    def store(self, text: str, metadata: dict = None):
        """Store memory with semantic embedding.

        Identical texts map to the same ID, so re-storing a text upserts
        (overwrites) rather than duplicating it.
        """
        vector = self.encoder.encode(text).tolist()

        point = PointStruct(
            id=self._point_id(text),
            vector=vector,
            payload={
                "text": text,
                "timestamp": time.time(),
                **(metadata or {})
            }
        )

        self.client.upsert(
            collection_name=self.collection,
            points=[point]
        )

    def recall(self, query: str, limit: int = 5) -> list[dict]:
        """Retrieve the `limit` memories most similar to `query`.

        Each result carries the point "id" so callers (notably forget())
        can delete it — the original omitted the ID, which made forget()
        raise KeyError on `hit["id"]`.
        """
        query_vector = self.encoder.encode(query).tolist()

        results = self.client.search(
            collection_name=self.collection,
            query_vector=query_vector,
            limit=limit
        )

        return [
            {
                "id": hit.id,
                "text": hit.payload["text"],
                "score": hit.score,
                "timestamp": hit.payload["timestamp"],
                **{k: v for k, v in hit.payload.items()
                   if k not in ["text", "timestamp"]}
            }
            for hit in results
        ]

    def forget(self, query: str, threshold: float = 0.9):
        """Delete memories whose similarity to `query` exceeds `threshold`."""
        results = self.recall(query, limit=10)

        ids_to_delete = [
            mem["id"] for mem in results
            if mem["score"] > threshold
        ]

        if ids_to_delete:
            self.client.delete(
                collection_name=self.collection,
                points_selector=ids_to_delete
            )

Usage:

ltm = LongTermMemory()

# Store experiences
ltm.store("User prefers technical explanations", {"category": "preference"})
ltm.store("Previous bug: SQL injection in login form", {"category": "bug"})
ltm.store("Successfully deployed to production on 2026-01-05", {"category": "milestone"})

# Recall relevant memories
memories = ltm.recall("How should I explain this?", limit=3)
# Returns: [{"text": "User prefers technical explanations", ...}]

3. Episodic Memory

Purpose: Remember specific events and conversations

from dataclasses import dataclass
from datetime import datetime
from typing import List

@dataclass
class Episode:
    """Single conversation episode (one completed interaction)."""
    id: str                 # UUID string assigned at creation
    timestamp: datetime     # when the episode was recorded
    messages: list[dict]    # raw chat messages (role/content dicts)
    summary: str            # one-line description used for semantic search
    outcome: str            # "success", "failure", "partial"
    tags: list[str]         # free-form labels, e.g. ["weather", "api_call"]

class EpisodicMemory:
    """Store and retrieve conversation episodes."""

    def __init__(self):
        self.episodes: list["Episode"] = []
        self.vector_store = LongTermMemory(collection_name="episodes")

    def store_episode(
        self,
        messages: list[dict],
        summary: str,
        outcome: str,
        tags: list[str] = None
    ):
        """Save a completed episode and index its summary for search."""
        episode = Episode(
            id=str(uuid.uuid4()),
            timestamp=datetime.now(),
            messages=messages,
            summary=summary,
            outcome=outcome,
            tags=tags or []
        )

        self.episodes.append(episode)

        # Store in vector DB for semantic search.
        # Use episode.tags (never None): the original stored the raw
        # `tags` argument, which could write None into the payload.
        self.vector_store.store(
            text=summary,
            metadata={
                "episode_id": episode.id,
                "outcome": outcome,
                "tags": episode.tags
            }
        )

    def recall_similar_episodes(self, query: str, limit: int = 3) -> list["Episode"]:
        """Find similar past episodes; hits with unknown IDs are skipped.

        The original could return None entries when a vector hit referenced
        an episode this instance no longer holds.
        """
        memories = self.vector_store.recall(query, limit=limit)

        found = (
            self._get_episode_by_id(mem["episode_id"])
            for mem in memories
        )
        return [ep for ep in found if ep is not None]

    def _get_episode_by_id(self, episode_id: str) -> "Episode":
        """Return the full episode for `episode_id`, or None if unknown."""
        return next(
            (ep for ep in self.episodes if ep.id == episode_id),
            None
        )

    def get_success_rate(self, tag: str = None) -> float:
        """Fraction of (optionally tag-filtered) episodes with outcome "success"."""
        episodes = self.episodes
        if tag:
            episodes = [ep for ep in episodes if tag in ep.tags]

        if not episodes:
            return 0.0

        successful = sum(1 for ep in episodes if ep.outcome == "success")
        return successful / len(episodes)

Usage:

episodic = EpisodicMemory()

# After completing a task
episodic.store_episode(
    messages=conversation_history,
    summary="User asked for weather in Tokyo, successfully fetched and responded",
    outcome="success",
    tags=["weather", "api_call", "tokyo"]
)

# Learn from past experiences
similar = episodic.recall_similar_episodes("How to fetch weather data?")
# Returns episodes about weather queries

# Check performance
success_rate = episodic.get_success_rate(tag="api_call")
print(f"API call success rate: {success_rate:.1%}")

Memory Integration in Agents

Hierarchical Memory Architecture

class AgentWithMemory:
    """Agent with integrated short-term, long-term, and episodic memory.

    The original never assigned `self.llm`, so process() crashed with
    AttributeError on its first call. The LLM client is now accepted via
    the constructor; the default of None keeps the old zero-argument
    signature working for existing callers.
    """

    def __init__(self, llm=None):
        self.llm = llm  # must expose an async .generate(prompt) method
        self.short_term = ShortTermMemory(max_messages=10)
        self.long_term = LongTermMemory()
        self.episodic = EpisodicMemory()

    async def process(self, user_message: str) -> str:
        """Answer a user message using all three memory tiers."""
        # 1. Add to short-term memory
        self.short_term.add("user", user_message)

        # 2. Recall relevant long-term memories
        relevant_memories = self.long_term.recall(user_message, limit=3)

        # 3. Find similar past episodes
        similar_episodes = self.episodic.recall_similar_episodes(user_message, limit=2)

        # 4. Build enriched context
        context = self._build_context(
            short_term=self.short_term.get_context(),
            long_term=relevant_memories,
            episodes=similar_episodes
        )

        # 5. Generate response
        response = await self.llm.generate(context)

        # 6. Update memories
        self.short_term.add("assistant", response)
        self.long_term.store(f"Q: {user_message}\nA: {response}")

        return response

    def _build_context(self, short_term, long_term, episodes) -> str:
        """Combine the three memory tiers into a single prompt string."""
        context = "# Current Conversation\n"
        for msg in short_term:
            context += f"{msg['role']}: {msg['content']}\n"

        if long_term:
            context += "\n# Relevant Past Information\n"
            for mem in long_term:
                context += f"- {mem['text']}\n"

        if episodes:
            context += "\n# Similar Past Interactions\n"
            for ep in episodes:
                context += f"- {ep.summary} (outcome: {ep.outcome})\n"

        return context

Memory Optimization

1. Summarization for Context Window

class SummarizingMemory:
    """Keep the chat history within a token budget by summarizing old turns."""

    def __init__(self, llm, max_tokens: int = 4000):
        self.llm = llm
        self.max_tokens = max_tokens
        self.messages = []
        self.summary = ""

    async def add(self, role: str, content: str):
        """Record a message; compress the history once it grows too large."""
        self.messages.append({"role": role, "content": content})
        if self._estimate_tokens() > self.max_tokens:
            await self._summarize_and_compress()

    async def _summarize_and_compress(self):
        """Fold everything but the five newest messages into the summary."""
        older, recent = self.messages[:-5], self.messages[-5:]
        if not older:
            return

        transcript = "\n".join(
            f"{entry['role']}: {entry['content']}" for entry in older
        )
        condensed = await self.llm.generate(
            f"Summarize this conversation:\n{transcript}"
        )
        self.summary += f"\n{condensed}"
        self.messages = recent

    def _estimate_tokens(self) -> int:
        """Crude size estimate: roughly one token per four characters."""
        char_count = len(self.summary) + sum(
            len(entry["content"]) for entry in self.messages
        )
        return char_count // 4

    def get_context(self) -> list[dict]:
        """Return the running summary (as a system message) plus recent turns."""
        if not self.summary:
            return list(self.messages)
        header = {
            "role": "system",
            "content": f"Previous conversation summary:\n{self.summary}"
        }
        return [header] + self.messages

2. Tiered Storage

class TieredMemory:
    """Hot/warm/cold memory tiers (in-process / vector DB / archival dict)."""

    def __init__(self):
        self.hot = []  # In-memory, instant access
        self.warm = LongTermMemory()  # Vector DB, fast
        self.cold = {}  # S3/disk, slow but cheap

    def store(self, memory: dict, tier: str = "hot"):
        """Store in the given tier; unknown tier names are silently ignored."""
        if tier == "hot":
            self.hot.append(memory)
        elif tier == "warm":
            self.warm.store(memory["text"], memory.get("metadata"))
        elif tier == "cold":
            memory_id = str(uuid.uuid4())
            self.cold[memory_id] = memory
            # In production: upload to S3
            # boto3.client('s3').put_object(...)

    def recall(self, query: str) -> list:
        """Search across tiers (substring match in hot, semantic in warm)."""
        needle = query.lower()
        # .get() so a hot entry without a "text" key can't raise KeyError,
        # which the original did on m["text"].
        hot_results = [
            m for m in self.hot if needle in m.get("text", "").lower()
        ]

        warm_results = self.warm.recall(query, limit=5)

        # Hot results first: freshest and cheapest to fetch.
        return hot_results + warm_results

    def promote_to_hot(self, memory_id: str):
        """Move a cold memory into the hot tier (e.g. on frequent access)."""
        if memory_id in self.cold:
            self.hot.append(self.cold.pop(memory_id))

3. Selective Retention

class SelectiveMemory:
    """Only remember information an LLM judge scores as important."""

    def __init__(self, llm):
        self.llm = llm
        self.memory = LongTermMemory()

    async def should_remember(self, interaction: str) -> bool:
        """Score the interaction 0-10 via the LLM; remember when >= 7.

        The original called int(score.strip()) directly, which raises
        ValueError whenever the LLM pads its answer with prose. Parse
        defensively and treat an unparseable reply as "don't remember".
        """
        prompt = f"""Rate the importance of remembering this interaction (0-10):
{interaction}

Consider:
- Does it contain facts the user taught me?
- Does it reveal user preferences?
- Is it relevant for future interactions?
- Is it just small talk?

Respond with just a number 0-10."""

        reply = (await self.llm.generate(prompt)).strip()
        try:
            score = int(reply)
        except ValueError:
            # Fall back to the first run of digits in the reply, if any.
            digits = ""
            for ch in reply:
                if ch.isdigit():
                    digits += ch
                elif digits:
                    break
            if not digits:
                return False
            score = int(digits)
        return score >= 7

    async def store_if_important(self, interaction: str, metadata: dict = None):
        """Store the interaction only when should_remember() approves.

        Returns True when stored, False otherwise.
        """
        if await self.should_remember(interaction):
            self.memory.store(interaction, metadata)
            return True
        return False

Memory Challenges

1. Context Window Limits

Problem: LLMs have finite context windows. Solution: summarization + selective retrieval.

# Instead of full history
messages = all_messages  # Might exceed context

# Use compressed context
relevant = memory.recall(current_query, limit=5)
recent = messages[-10:]  # Last 10 messages
context = relevant + recent  # Fits in window

2. Memory Consistency

Problem: Conflicting information in memory. Solution: versioning and conflict resolution.

class VersionedMemory:
    """Key-value memory that keeps every historical value per key."""

    def __init__(self):
        # key -> {"current": latest value,
        #         "versions": [{"value": ..., "timestamp": ...}, ...]}
        # The original class had no __init__, so self.memory was never
        # created and every method raised AttributeError.
        self.memory = {}

    def store(self, key: str, value: str):
        """Store with version tracking; "current" always tracks the newest value.

        The original appended to "versions" on updates but never refreshed
        "current", leaving it stuck at the first value.
        """
        entry = self.memory.setdefault(key, {"current": value, "versions": []})
        entry["current"] = value
        entry["versions"].append({
            "value": value,
            "timestamp": time.time()
        })

    def get_latest(self, key: str) -> str:
        """Get the most recent version, or None for an unknown key."""
        if key in self.memory:
            return self.memory[key]["versions"][-1]["value"]
        return None

3. Privacy and Forgetting

Problem: Users want their data deleted. Solution: support the right to be forgotten.

class PrivacyAwareMemory:
    # Implements "right to be forgotten": purge a user's data from both
    # the vector store and the in-memory episode list.
    # NOTE(review): self.vector_store, self.collection and self.episodes
    # are not initialized in this snippet — presumably supplied by a
    # LongTermMemory/EpisodicMemory-style base or composition; confirm
    # against the real class.
    def forget_user_data(self, user_id: str):
        """Delete all memories for user"""
        # Delete from vector store
        # NOTE(review): the filter dict matches payload key "user_id",
        # but nothing in this file stores a "user_id" payload field —
        # verify callers actually attach it via store() metadata.
        self.vector_store.client.delete(
            collection_name=self.collection,
            points_selector={
                "filter": {
                    "must": [{"key": "user_id", "match": {"value": user_id}}]
                }
            }
        )

        # Delete episodes
        # NOTE(review): the Episode dataclass above has no `metadata`
        # field, so `ep.metadata` would raise AttributeError for those
        # objects — verify the episode type used here carries a metadata
        # dict.
        self.episodes = [
            ep for ep in self.episodes
            if ep.metadata.get("user_id") != user_id
        ]

Related: Agent Security Considerations

Production Patterns

1. Distributed Memory

import redis

class DistributedMemory:
    """Share memory across agent instances via a Redis backend."""

    def __init__(self, redis_url: str):
        self.redis = redis.from_url(redis_url)

    def store(self, key: str, value: dict, ttl: int = 3600):
        """Persist `value` under `key`, expiring after `ttl` seconds."""
        payload = json.dumps(value)
        self.redis.setex(key, ttl, payload)

    def recall(self, key: str) -> dict:
        """Fetch a previously stored value; None when missing or expired."""
        raw = self.redis.get(key)
        if not raw:
            return None
        return json.loads(raw)

2. Async Memory Operations

class AsyncMemory:
    """Non-blocking memory operations."""

    async def store_async(self, text: str, metadata: dict):
        """Schedule storage in the background and return immediately.

        The original did `await asyncio.create_task(...)`, which blocks
        until the task finishes — exactly what this class promises not to
        do. We instead keep a strong reference to the task (the event loop
        only holds weak references, so an unreferenced task can be
        garbage-collected mid-flight) and return it to the caller.
        """
        if not hasattr(self, "_pending"):
            self._pending = set()
        task = asyncio.create_task(self._background_store(text, metadata))
        self._pending.add(task)
        # Drop the reference once the task completes.
        task.add_done_callback(self._pending.discard)
        return task

    async def _background_store(self, text: str, metadata: dict):
        """Background storage task: embed the text, then upsert it."""
        vector = await self.encoder.encode_async(text)
        await self.vector_store.upsert_async(vector, metadata)

Related: Production Agent Deployment

Connection Points

Prerequisites:

Related:

Advanced:

>> referenced by (7)